Excel is the most commonly used data analysis software in the world: it is easy to get the hang of and use. Python, with a library like pandas,
is a good way to build scale in business analytics. Here are some reasons:
pandas
provides various facilities for easily combining Series or DataFrames, with various kinds of set logic for the indexes and relational-algebra functionality in the case of join / merge-type operations.
import pandas as pd
import numpy as np
merge()
is used to combine two (or more) dataframes on the basis of values of common columns or indexes, as in an RDBMS.
np.random.seed(42)
left = pd.DataFrame({'key': ['a', 'b', 'c', 'd', 'e'],
'dim1':['grp1', 'grp2','grp2','grp3','grp3'],
'values': np.random.normal(65,10,5)})
print(left)
  key  dim1     values
0   a  grp1  69.967142
1   b  grp2  63.617357
2   c  grp2  71.476885
3   d  grp3  80.230299
4   e  grp3  62.658466
np.random.seed(42)
right = pd.DataFrame({'key': ['a','b','c','x','y','z'],
'dim2':['aaa', 'aaa','bbb','bbb', 'ccc', 'aaa'],
'values': np.random.randint(100, 150, 6)})
print(right)
  key dim2  values
0   a  aaa     138
1   b  aaa     128
2   c  bbb     114
3   x  bbb     142
4   y  ccc     107
5   z  aaa     120
For more details on generating various types of data using numpy, please see this tutorial.
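As a quick aside, a hedged sketch of the kinds of numpy data generation used above (the `default_rng` generator is the modern alternative to `np.random.seed`; the variable names here are illustrative only):

```python
import numpy as np

rng = np.random.default_rng(42)          # seeded generator for reproducibility
normals = rng.normal(65, 10, 3)          # continuous values, mean 65, sd 10
ints = rng.integers(100, 150, 3)         # integers drawn from [100, 150)
cats = rng.choice(['aaa', 'bbb'], 3)     # categories sampled with replacement
```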
left
 | key | dim1 | values |
---|---|---|---|
0 | a | grp1 | 69.967142 |
1 | b | grp2 | 63.617357 |
2 | c | grp2 | 71.476885 |
3 | d | grp3 | 80.230299 |
4 | e | grp3 | 62.658466 |
right
 | key | dim2 | values |
---|---|---|---|
0 | a | aaa | 138 |
1 | b | aaa | 128 |
2 | c | bbb | 114 |
3 | x | bbb | 142 |
4 | y | ccc | 107 |
5 | z | aaa | 120 |
pd.merge(left, right, how='inner', on='key')
 | key | dim1 | values_x | dim2 | values_y |
---|---|---|---|---|---|
0 | a | grp1 | 69.967142 | aaa | 138 |
1 | b | grp2 | 63.617357 | aaa | 128 |
2 | c | grp2 | 71.476885 | bbb | 114 |
pd.merge(left, right, how='inner', right_index=True, left_index=True, suffixes=('_left', '_right'))
 | key_left | dim1 | values_left | key_right | dim2 | values_right |
---|---|---|---|---|---|---|
0 | a | grp1 | 69.967142 | a | aaa | 138 |
1 | b | grp2 | 63.617357 | b | aaa | 128 |
2 | c | grp2 | 71.476885 | c | bbb | 114 |
3 | d | grp3 | 80.230299 | x | bbb | 142 |
4 | e | grp3 | 62.658466 | y | ccc | 107 |
pd.merge(left,right,how='left',on='key')
 | key | dim1 | values_x | dim2 | values_y |
---|---|---|---|---|---|
0 | a | grp1 | 69.967142 | aaa | 138.0 |
1 | b | grp2 | 63.617357 | aaa | 128.0 |
2 | c | grp2 | 71.476885 | bbb | 114.0 |
3 | d | grp3 | 80.230299 | NaN | NaN |
4 | e | grp3 | 62.658466 | NaN | NaN |
df= pd.merge(left,right,how='left',on='key')
df.isna().sum()
key         0
dim1        0
values_x    0
dim2        2
values_y    2
dtype: int64
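Beyond counting NaNs, a related way to audit join coverage is merge's `indicator` option, which adds a `_merge` column marking each row's origin (a sketch using the same `left`/`right` frames):

```python
import pandas as pd
import numpy as np

np.random.seed(42)
left = pd.DataFrame({'key': ['a', 'b', 'c', 'd', 'e'],
                     'dim1': ['grp1', 'grp2', 'grp2', 'grp3', 'grp3'],
                     'values': np.random.normal(65, 10, 5)})
right = pd.DataFrame({'key': ['a', 'b', 'c', 'x', 'y', 'z'],
                      'dim2': ['aaa', 'aaa', 'bbb', 'bbb', 'ccc', 'aaa'],
                      'values': np.random.randint(100, 150, 6)})

# _merge is 'both' for matched keys, 'left_only' for unmatched left rows
audit = pd.merge(left, right, how='left', on='key', indicator=True)
print(audit['_merge'].value_counts())
# keys a, b, c are 'both'; d, e are 'left_only'
```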
pd.merge(left,right, left_index=True, right_index=True, how='left')
 | key_x | dim1 | values_x | key_y | dim2 | values_y |
---|---|---|---|---|---|---|
0 | a | grp1 | 69.967142 | a | aaa | 138 |
1 | b | grp2 | 63.617357 | b | aaa | 128 |
2 | c | grp2 | 71.476885 | c | bbb | 114 |
3 | d | grp3 | 80.230299 | x | bbb | 142 |
4 | e | grp3 | 62.658466 | y | ccc | 107 |
pd.merge(left,right,how='right',on='key')
 | key | dim1 | values_x | dim2 | values_y |
---|---|---|---|---|---|
0 | a | grp1 | 69.967142 | aaa | 138 |
1 | b | grp2 | 63.617357 | aaa | 128 |
2 | c | grp2 | 71.476885 | bbb | 114 |
3 | x | NaN | NaN | bbb | 142 |
4 | y | NaN | NaN | ccc | 107 |
5 | z | NaN | NaN | aaa | 120 |
pd.merge(left,right,how='outer',on='key')
 | key | dim1 | values_x | dim2 | values_y |
---|---|---|---|---|---|
0 | a | grp1 | 69.967142 | aaa | 138.0 |
1 | b | grp2 | 63.617357 | aaa | 128.0 |
2 | c | grp2 | 71.476885 | bbb | 114.0 |
3 | d | grp3 | 80.230299 | NaN | NaN |
4 | e | grp3 | 62.658466 | NaN | NaN |
5 | x | NaN | NaN | bbb | 142.0 |
6 | y | NaN | NaN | ccc | 107.0 |
7 | z | NaN | NaN | aaa | 120.0 |
pd.merge(left,right, how='outer',left_index=True, right_index=True)
 | key_x | dim1 | values_x | key_y | dim2 | values_y |
---|---|---|---|---|---|---|
0 | a | grp1 | 69.967142 | a | aaa | 138 |
1 | b | grp2 | 63.617357 | b | aaa | 128 |
2 | c | grp2 | 71.476885 | c | bbb | 114 |
3 | d | grp3 | 80.230299 | x | bbb | 142 |
4 | e | grp3 | 62.658466 | y | ccc | 107 |
5 | NaN | NaN | NaN | z | aaa | 120 |
concat()
is used to append one (or more) dataframes one below the other (or side by side, depending on whether the axis option is set to 0 or 1). Note that it takes a list of dataframes; recall what a list looks like:
type([1,2])
list
pd.concat([left, right], axis=0)
 | key | dim1 | values | dim2 |
---|---|---|---|---|
0 | a | grp1 | 69.967142 | NaN |
1 | b | grp2 | 63.617357 | NaN |
2 | c | grp2 | 71.476885 | NaN |
3 | d | grp3 | 80.230299 | NaN |
4 | e | grp3 | 62.658466 | NaN |
0 | a | NaN | 138.000000 | aaa |
1 | b | NaN | 128.000000 | aaa |
2 | c | NaN | 114.000000 | bbb |
3 | x | NaN | 142.000000 | bbb |
4 | y | NaN | 107.000000 | ccc |
5 | z | NaN | 120.000000 | aaa |
In the above example the original row indexes are preserved, so the resulting index is not in order. Setting ignore_index=True builds a fresh index:
pd.concat([left, right], axis=0, ignore_index=True)
 | key | dim1 | values | dim2 |
---|---|---|---|---|
0 | a | grp1 | 69.967142 | NaN |
1 | b | grp2 | 63.617357 | NaN |
2 | c | grp2 | 71.476885 | NaN |
3 | d | grp3 | 80.230299 | NaN |
4 | e | grp3 | 62.658466 | NaN |
5 | a | NaN | 138.000000 | aaa |
6 | b | NaN | 128.000000 | aaa |
7 | c | NaN | 114.000000 | bbb |
8 | x | NaN | 142.000000 | bbb |
9 | y | NaN | 107.000000 | ccc |
10 | z | NaN | 120.000000 | aaa |
keys: Construct hierarchical index using the passed keys as the outermost level.
df= pd.concat([left, right], axis=0, keys=['left', 'right'])
print(df)
         key  dim1      values dim2
left  0    a  grp1   69.967142  NaN
      1    b  grp2   63.617357  NaN
      2    c  grp2   71.476885  NaN
      3    d  grp3   80.230299  NaN
      4    e  grp3   62.658466  NaN
right 0    a   NaN  138.000000  aaa
      1    b   NaN  128.000000  aaa
      2    c   NaN  114.000000  bbb
      3    x   NaN  142.000000  bbb
      4    y   NaN  107.000000  ccc
      5    z   NaN  120.000000  aaa
s= 'grp2'
left.loc[left.dim1== s]
 | key | dim1 | values |
---|---|---|---|
1 | b | grp2 | 63.617357 |
2 | c | grp2 | 71.476885 |
df.loc['left',:]
 | key | dim1 | values | dim2 |
---|---|---|---|---|
0 | a | grp1 | 69.967142 | NaN |
1 | b | grp2 | 63.617357 | NaN |
2 | c | grp2 | 71.476885 | NaN |
3 | d | grp3 | 80.230299 | NaN |
4 | e | grp3 | 62.658466 | NaN |
df.loc['right']
 | key | dim1 | values | dim2 |
---|---|---|---|---|
0 | a | NaN | 138.0 | aaa |
1 | b | NaN | 128.0 | aaa |
2 | c | NaN | 114.0 | bbb |
3 | x | NaN | 142.0 | bbb |
4 | y | NaN | 107.0 | ccc |
5 | z | NaN | 120.0 | aaa |
Concatenating column wise
df = pd.concat([left,right],axis=1)
df
 | key | dim1 | values | key | dim2 | values |
---|---|---|---|---|---|---|
0 | a | grp1 | 69.967142 | a | aaa | 138 |
1 | b | grp2 | 63.617357 | b | aaa | 128 |
2 | c | grp2 | 71.476885 | c | bbb | 114 |
3 | d | grp3 | 80.230299 | x | bbb | 142 |
4 | e | grp3 | 62.658466 | y | ccc | 107 |
5 | NaN | NaN | NaN | z | aaa | 120 |
pd.concat([left, right], axis=1, join='outer')
 | key | dim1 | values | key | dim2 | values |
---|---|---|---|---|---|---|
0 | a | grp1 | 69.967142 | a | aaa | 138 |
1 | b | grp2 | 63.617357 | b | aaa | 128 |
2 | c | grp2 | 71.476885 | c | bbb | 114 |
3 | d | grp3 | 80.230299 | x | bbb | 142 |
4 | e | grp3 | 62.658466 | y | ccc | 107 |
5 | NaN | NaN | NaN | z | aaa | 120 |
My favorite application of .concat()
# 3 series created programmatically
s1= pd.Series(data=[True,False,True,True,False], index=['a', 'b', 'c', 'd','e'])
s2= pd.Series(data=[100.00,110.00,120.00,130.00,75.00], index=['a', 'b', 'c', 'd','e'])
s3= pd.Series(data=['A','B','C','B','A'], index=['a', 'b', 'c', 'd','e'])
# A list of the 3 series
lst_s= [s1,s2,s3]
# form a dataframe using the series as columns
df= pd.concat(lst_s, axis=1)
df.columns= ['bool_flag', 'dollars', 'category']
df
 | bool_flag | dollars | category |
---|---|---|---|
a | True | 100.0 | A |
b | False | 110.0 | B |
c | True | 120.0 | C |
d | True | 130.0 | B |
e | False | 75.0 | A |
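The same frame can also be built in one step by passing a dict of Series to the DataFrame constructor, which aligns them on the shared index (a sketch equivalent to the concat above; column names come from the dict keys):

```python
import pandas as pd

idx = ['a', 'b', 'c', 'd', 'e']
s1 = pd.Series([True, False, True, True, False], index=idx)
s2 = pd.Series([100.0, 110.0, 120.0, 130.0, 75.0], index=idx)
s3 = pd.Series(['A', 'B', 'C', 'B', 'A'], index=idx)

# dict keys become column names; Series align on the common index
df2 = pd.DataFrame({'bool_flag': s1, 'dollars': s2, 'category': s3})
print(df2)
```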
Simple aggregations such as sum(), mean(), median(), min(), and max() can give you a flavor of your dataset, but often we would prefer to aggregate conditionally on some label or index: this is implemented in the so-called groupby operation.
Split, Apply & Combine: A canonical example of this split-apply-combine operation, where the "apply" is a summation aggregation, is illustrated in this figure:
You may find this interesting
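A minimal illustration of the split-apply-combine pattern, on hypothetical data, with the "apply" step being a summation:

```python
import pandas as pd

toy = pd.DataFrame({'grp': ['A', 'B', 'A', 'B'], 'val': [1, 2, 3, 4]})
# split: rows are partitioned by 'grp'
# apply: each partition's 'val' column is summed
# combine: per-group results are stitched into one Series indexed by group
out = toy.groupby('grp')['val'].sum()
print(out)  # A -> 4, B -> 6
```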
.groupby()
Just to make things a little interesting, we shall use basketball team data!
sport_data = {'Team': ['Nets', 'Nets', 'Clippers', 'Clippers', 'Warriors',
'Warriors', 'Warriors', 'Warriors', 'Nets', 'Lakers', 'Lakers', 'Nets'],
'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,
2015,2017],
'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df=pd.DataFrame(sport_data)
print (df)
        Team  Rank  Year  Points
0       Nets     1  2014     876
1       Nets     2  2015     789
2   Clippers     2  2014     863
3   Clippers     3  2015     673
4   Warriors     3  2014     741
5   Warriors     4  2015     812
6   Warriors     1  2016     756
7   Warriors     1  2017     788
8       Nets     2  2016     694
9     Lakers     4  2014     701
10    Lakers     1  2015     804
11      Nets     2  2017     690
What is the total points won by the basketball teams?
df.groupby(['Team'])['Points'].sum()
Team
Clippers    1536
Lakers      1505
Nets        3049
Warriors    3097
Name: Points, dtype: int64
Output Structure of a .groupby
gf= df.groupby(['Team'])['Points'].sum()
print()
print('What data structure is it? \n', type(gf))
print()
print(' What does the index look like? \n', gf.index)
print()
print('Values:',gf.values)
print()
print('After reset_index')
af= gf.reset_index()
print()
print('What data structure now? \n', type(af))
print()
print(' What does the index look like? \n', af.index)
print()
print(' What are the columns? \n', af.columns)
What data structure is it? 
 <class 'pandas.core.series.Series'>

 What does the index look like? 
 Index(['Clippers', 'Lakers', 'Nets', 'Warriors'], dtype='object', name='Team')

Values: [1536 1505 3049 3097]

After reset_index

What data structure now? 
 <class 'pandas.core.frame.DataFrame'>

 What does the index look like? 
 RangeIndex(start=0, stop=4, step=1)

 What are the columns? 
 Index(['Team', 'Points'], dtype='object')
af
 | Team | Points |
---|---|---|
0 | Clippers | 1536 |
1 | Lakers | 1505 |
2 | Nets | 3049 |
3 | Warriors | 3097 |
We shall use these properties when we are making graphs.
What is the yearly total points by the basketball teams?
df.groupby(['Year', 'Team'])['Points'].sum()
Year  Team    
2014  Clippers    863
      Lakers      701
      Nets        876
      Warriors    741
2015  Clippers    673
      Lakers      804
      Nets        789
      Warriors    812
2016  Nets        694
      Warriors    756
2017  Nets        690
      Warriors    788
Name: Points, dtype: int64
What is the yearly total points by the basketball teams when they were ranked 1?
df.loc[df.Rank==1].groupby(['Year', 'Team'])['Points'].sum()
Year  Team    
2014  Nets        876
2015  Lakers      804
2016  Warriors    756
2017  Warriors    788
Name: Points, dtype: int64
jj= left.merge(right, left_on='key', right_on='key')
jj.loc[jj.dim2=='aaa'].groupby(['dim1','dim2'])[['values_x', 'values_y']].sum()
dim1 | dim2 | values_x | values_y |
---|---|---|---|
grp1 | aaa | 69.967142 | 138 |
grp2 | aaa | 63.617357 | 128 |
.agg()
Function to use for aggregating the data. Useful when you want to apply several aggregation functions at the same time, passed as a list.
Since pandas 0.25.0, groupby aggregation with relabelling is supported using "named aggregation" with simple tuples. Python tuples are used to provide the column to work on, along with the function to apply.
df.groupby(['Team'])['Points'].agg([np.min, np.max, np.mean,np.std, np.size])
Team | amin | amax | mean | std | size |
---|---|---|---|---|---|
Clippers | 673 | 863 | 768.00 | 134.350288 | 2 |
Lakers | 701 | 804 | 752.50 | 72.831998 | 2 |
Nets | 690 | 876 | 762.25 | 88.567771 | 4 |
Warriors | 741 | 812 | 774.25 | 31.899582 | 4 |
The list form of the function argument is the most common. For other supported input structures look here
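One such alternative structure is a dict mapping each column to its function(s) (a sketch on a cut-down version of the sport data; not taken from the original):

```python
import pandas as pd

df = pd.DataFrame({'Team': ['Nets', 'Nets', 'Clippers', 'Clippers'],
                   'Rank': [1, 2, 2, 3],
                   'Points': [876, 789, 863, 673]})

# dict form: different aggregations per column; mixed lists/scalars
# produce MultiIndex columns like ('Points', 'min')
res = df.groupby('Team').agg({'Points': ['min', 'max'], 'Rank': 'mean'})
print(res)
```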
Explicitly Labeling Output Columns
df.groupby(['Team']).agg(min_points= ('Points', np.min),max_points= ('Points', np.max), mean_Rank= ('Rank', np.mean))
Team | min_points | max_points | mean_Rank |
---|---|---|---|
Clippers | 673 | 863 | 2.50 |
Lakers | 701 | 804 | 2.50 |
Nets | 690 | 876 | 1.75 |
Warriors | 741 | 812 | 2.25 |
.crosstab()
Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed.
pd.crosstab(index=df.Team, columns=df.Year)
Team | 2014 | 2015 | 2016 | 2017 |
---|---|---|---|---|
Clippers | 1 | 1 | 0 | 0 |
Lakers | 1 | 1 | 0 | 0 |
Nets | 1 | 1 | 1 | 1 |
Warriors | 1 | 1 | 1 | 1 |
pd.crosstab(index=df['Team'], columns=df['Year'], values=df['Points'], aggfunc='sum')
Team | 2014 | 2015 | 2016 | 2017 |
---|---|---|---|---|
Clippers | 863.0 | 673.0 | NaN | NaN |
Lakers | 701.0 | 804.0 | NaN | NaN |
Nets | 876.0 | 789.0 | 694.0 | 690.0 |
Warriors | 741.0 | 812.0 | 756.0 | 788.0 |
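crosstab can also append row and column totals via its `margins` option (a sketch on a cut-down Team/Year frame; `margins_name` just labels the totals row/column):

```python
import pandas as pd

df = pd.DataFrame({'Team': ['Nets', 'Nets', 'Clippers', 'Warriors'],
                   'Year': [2014, 2015, 2014, 2014],
                   'Points': [876, 789, 863, 741]})

# margins=True adds a 'Total' row and column holding the aggregated sums
ct = pd.crosstab(index=df['Team'], columns=df['Year'],
                 values=df['Points'], aggfunc='sum',
                 margins=True, margins_name='Total')
print(ct)
```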
.pivot()
Reshape data (produce a “pivot” table) based on column values. Uses unique values from specified index / columns to form axes of the resulting DataFrame. This function does not support data aggregation, multiple values will result in a MultiIndex in the columns.
df.pivot(index='Team', columns='Year', values='Points')
Team | 2014 | 2015 | 2016 | 2017 |
---|---|---|---|---|
Clippers | 863.0 | 673.0 | NaN | NaN |
Lakers | 701.0 | 804.0 | NaN | NaN |
Nets | 876.0 | 789.0 | 694.0 | 690.0 |
Warriors | 741.0 | 812.0 | 756.0 | 788.0 |
df.groupby(['Year', 'Team'])['Points'].sum().reset_index().pivot(index='Team', columns='Year', values='Points')
Team | 2014 | 2015 | 2016 | 2017 |
---|---|---|---|---|
Clippers | 863.0 | 673.0 | NaN | NaN |
Lakers | 701.0 | 804.0 | NaN | NaN |
Nets | 876.0 | 789.0 | 694.0 | 690.0 |
Warriors | 741.0 | 812.0 | 756.0 | 788.0 |
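The groupby-then-pivot chain above can be collapsed into one call with pivot_table, which (unlike pivot) does support aggregation of duplicate index/column pairs (a sketch with hypothetical points, including a duplicate Nets/2014 row to show the summing):

```python
import pandas as pd

df = pd.DataFrame({'Team': ['Nets', 'Nets', 'Warriors', 'Warriors'],
                   'Year': [2014, 2014, 2014, 2015],
                   'Points': [876, 100, 741, 812]})

# duplicate (Nets, 2014) rows are aggregated with aggfunc='sum'
pt = pd.pivot_table(df, index='Team', columns='Year',
                    values='Points', aggfunc='sum')
print(pt)
```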
.apply()
applies a function along an axis of the dataframe, instead of looping through it (axis=0 applies the function to each column; axis=1 applies it to each row). A lambda function can be used to iterate over the rows of the dataframe.
df
 | Team | Rank | Year | Points |
---|---|---|---|---|
0 | Nets | 1 | 2014 | 876 |
1 | Nets | 2 | 2015 | 789 |
2 | Clippers | 2 | 2014 | 863 |
3 | Clippers | 3 | 2015 | 673 |
4 | Warriors | 3 | 2014 | 741 |
5 | Warriors | 4 | 2015 | 812 |
6 | Warriors | 1 | 2016 | 756 |
7 | Warriors | 1 | 2017 | 788 |
8 | Nets | 2 | 2016 | 694 |
9 | Lakers | 4 | 2014 | 701 |
10 | Lakers | 1 | 2015 | 804 |
11 | Nets | 2 | 2017 | 690 |
.apply()
on a single column
def f(x):
    if x >= 806:
        return 'High'
    else:
        return 'Low'
f(900)
'High'
df['Scr_flg']= df.Points.apply(lambda x: f(x))
df
 | Team | Rank | Year | Points | Scr_flg |
---|---|---|---|---|---|
0 | Nets | 1 | 2014 | 876 | High |
1 | Nets | 2 | 2015 | 789 | Low |
2 | Clippers | 2 | 2014 | 863 | High |
3 | Clippers | 3 | 2015 | 673 | Low |
4 | Warriors | 3 | 2014 | 741 | Low |
5 | Warriors | 4 | 2015 | 812 | High |
6 | Warriors | 1 | 2016 | 756 | Low |
7 | Warriors | 1 | 2017 | 788 | Low |
8 | Nets | 2 | 2016 | 694 | Low |
9 | Lakers | 4 | 2014 | 701 | Low |
10 | Lakers | 1 | 2015 | 804 | Low |
11 | Nets | 2 | 2017 | 690 | Low |
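For a simple threshold flag like this, a vectorized alternative to .apply is np.where (a sketch, using the same 806 cutoff on a few hypothetical Points values):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'Points': [876, 789, 863, 673]})
# np.where evaluates the condition over the whole column at once,
# avoiding a Python-level loop per row
df['Scr_flg'] = np.where(df['Points'] >= 806, 'High', 'Low')
print(df)
```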
.apply()
on more than one column
def g(x, y):
    if (x == 1) & (y == 'High'):
        return '11'
    elif (x != 1) & (y == 'High'):
        return '01'
    else:
        return '00'
Watch out! The above function is built to work on single values and scalars, not on whole columns.
g(4, 'Low')
'00'
df['rank_score_flg']=df.apply(lambda x: g(x['Rank'], x['Scr_flg']), axis=1)
df
 | Team | Rank | Year | Points | Scr_flg | rank_score_flg |
---|---|---|---|---|---|---|
0 | Nets | 1 | 2014 | 876 | High | 11 |
1 | Nets | 2 | 2015 | 789 | Low | 00 |
2 | Clippers | 2 | 2014 | 863 | High | 01 |
3 | Clippers | 3 | 2015 | 673 | Low | 00 |
4 | Warriors | 3 | 2014 | 741 | Low | 00 |
5 | Warriors | 4 | 2015 | 812 | High | 01 |
6 | Warriors | 1 | 2016 | 756 | Low | 00 |
7 | Warriors | 1 | 2017 | 788 | Low | 00 |
8 | Nets | 2 | 2016 | 694 | Low | 00 |
9 | Lakers | 4 | 2014 | 701 | Low | 00 |
10 | Lakers | 1 | 2015 | 804 | Low | 00 |
11 | Nets | 2 | 2017 | 690 | Low | 00 |
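The two-column version can likewise be vectorized with np.select, which maps a list of conditions to a list of outputs (a sketch of the same logic as g, on hypothetical rows):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'Rank': [1, 2, 3], 'Scr_flg': ['High', 'High', 'Low']})
# conditions are checked in order; the first match wins, else the default
conds = [(df['Rank'] == 1) & (df['Scr_flg'] == 'High'),
         (df['Rank'] != 1) & (df['Scr_flg'] == 'High')]
df['rank_score_flg'] = np.select(conds, ['11', '01'], default='00')
print(df)
```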
pandas
provides a list of aggregation functions that typically find use:
count() : Number of non-null observations
sum() : Sum of values
mean() : Mean of values (average)
median() : Median of values (central value)
mode() : Mode of values (most common value)
std() : Standard deviation of the values (spread of the data)
min() : Minimum value (lowest value)
max() : Maximum value (highest value)
abs() : Absolute value (non-negative component)
prod() : Product of values
cumsum() : Cumulative sum (running sum by dimension)
cumprod() : Cumulative product (running product)
Note: Since a DataFrame is a heterogeneous data structure, generic operations don't work with all columns.
df.sum()
Team              NetsNetsClippersClippersWarriorsWarriorsWarrio...
Rank                                                            26
Year                                                         24182
Points                                                        9187
Scr_flg                   HighLowHighLowLowHighLowLowLowLowLowLow
rank_score_flg                    110001000001000000000000
dtype: object
df.median(axis=0)
Rank                 2.0
Year              2015.0
Points             772.0
rank_score_flg       0.0
dtype: float64
df.groupby('Team')['Points'].agg('mean')
Team
Clippers    768.00
Lakers      752.50
Nets        762.25
Warriors    774.25
Name: Points, dtype: float64
df.groupby('Team')['Points'].agg('describe')
Team | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
Clippers | 2.0 | 768.00 | 134.350288 | 673.0 | 720.50 | 768.0 | 815.50 | 863.0 |
Lakers | 2.0 | 752.50 | 72.831998 | 701.0 | 726.75 | 752.5 | 778.25 | 804.0 |
Nets | 4.0 | 762.25 | 88.567771 | 690.0 | 693.00 | 741.5 | 810.75 | 876.0 |
Warriors | 4.0 | 774.25 | 31.899582 | 741.0 | 752.25 | 772.0 | 794.00 | 812.0 |
matplotlib
is a Python package used for data plotting and visualisation. It is a useful complement to pandas, and like pandas, is a very feature-rich library which can produce a large variety of plots, charts, maps, and other visualisations.
This section covers the basic capabilities of matplotlib in conjunction with pandas.
import matplotlib.pyplot as plt
plt.figure(figsize=(8,8))
plt.scatter(df.Rank, df.Points, marker='o', c='orange')
plt.xlabel('Team Rank')
plt.ylabel('Team score')
plt.title('Scatter Plot of Team Score versus Rank \n')
plt.show()
af.plot('Team', 'Points', kind='bar', ylabel='Total Points', xlabel='Teams', color='red', figsize=(10,10),\
title= 'Bar Plot of Total Score versus Team \n')
af.plot('Team', 'Points', kind='barh', ylabel='Total Points', xlabel='Teams', color='magenta', figsize=(10,10),\
title= 'Horizontal Bar Plot of Total Score versus Team \n')
# Sum of Points for every year across all teams
df.groupby('Year')['Points'].sum().reset_index().\
plot('Year', 'Points', kind='line',ylabel='Total Points', xlabel='Year', \
color='magenta', figsize=(10,6),title= 'Line Plot of Points versus Year \n')
df.hist('Points', bins=15, legend=True, figsize=(10,6))
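To keep a figure rather than only display it, plot onto an explicit Axes and save the Figure (a sketch; `points_hist.png` is a hypothetical filename, and the Agg backend is selected so this also runs headless):

```python
import os

import matplotlib
matplotlib.use('Agg')  # non-interactive backend; no display needed
import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({'Points': [876, 789, 863, 673, 741]})
fig, ax = plt.subplots(figsize=(8, 5))
# pandas plotting accepts an explicit Axes via the ax parameter
df['Points'].plot(kind='hist', bins=5, ax=ax, title='Points distribution')
fig.savefig('points_hist.png')
plt.close(fig)

saved = os.path.getsize('points_hist.png') > 0
```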